Packages and data

library(tidyverse)
library(tidymodels)
library(epidatasets)
library(epidatr)
library(epiprocess)
library(epipredict)

#set_cache(cache_dir = 'epi_cache', days = 30)
theme_set(theme_bw())

Download finalized COVID cases and deaths for California.

# Fetch daily confirmed COVID-19 case counts for California from the
# JHU CSSE signal via the Delphi Epidata API (Apr 2020 - Jan 2022).
cases <- pub_covidcast(
  source = "jhu-csse",
  signals = "confirmed_incidence_num",
  time_type = "day",
  geo_type = "state",
  time_values = epirange(20200401, 20220101),
  geo_values = "ca") %>%
  # keep only the key columns, renaming `value` to `cases`
  select(geo_value, time_value, cases = value)

# Fetch daily COVID-19 death counts for California over the same period,
# keeping the same key columns so the two tables can be joined below.
deaths <- pub_covidcast(
  source = "jhu-csse",
  signals = "deaths_incidence_num",
  time_type = "day",
  geo_type = "state",
  time_values = epirange(20200401, 20220101),
  geo_values = "ca") %>%
  select(geo_value, time_value, deaths = value)

Pre-process and visualize data

Join the dataframes and create epi_df.

# Combine cases and deaths into one table keyed by location and date,
# then convert to an `epi_df` (adds geo/time metadata and an as_of stamp).
ca <- left_join(cases, deaths, by = c("time_value", "geo_value")) %>%
  as_epi_df()

ca
## An `epi_df` object, 641 x 4 with metadata:
## * geo_type  = state
## * time_type = day
## * as_of     = 2024-10-01 17:21:57.022087
## 
## # A tibble: 641 × 4
##    geo_value time_value cases deaths
##  * <chr>     <date>     <dbl>  <dbl>
##  1 ca        2020-04-01  1254     29
##  2 ca        2020-04-02  1499     37
##  3 ca        2020-04-03  1324     40
##  4 ca        2020-04-04   746     17
##  5 ca        2020-04-05  1655     50
##  6 ca        2020-04-06  1510     28
##  7 ca        2020-04-07  1097     53
##  8 ca        2020-04-08  1480     63
##  9 ca        2020-04-09   960     52
## 10 ca        2020-04-10  1216     38
## # ℹ 631 more rows

Scale cases and deaths by population. State population data is available inside {epipredict} as state_census.

# Attach each state's population and convert raw counts into rates
# per 100,000 people, then drop the helper column.
ca <- ca %>%
  left_join(state_census %>% select(pop, abbr),
            by = c("geo_value" = "abbr")) %>%
  mutate(cases = cases / pop * 1e5,    # cases per 100k
         deaths = deaths / pop * 1e5)  %>% # deaths per 100k
  select(-pop)

Now, use epi_slide() to calculate trailing 7-day averages of cases and deaths.

# Trailing 7-day averages (each day plus the 6 before it). Note the
# computations overwrite `cases` and `deaths` in place with the smoothed
# versions; the second slide therefore runs on the cases-smoothed data,
# but `deaths` is untouched by the first slide so the order is harmless.
ca <- ca %>%
  epi_slide(cases = mean(cases), before = 6) %>%
  epi_slide(deaths = mean(deaths), before = 6) 

Visualize the data.

# Plot the two smoothed signals in free-scaled facets.
ca %>%
  pivot_longer(cols = c(cases, deaths), names_to = 'Signal') %>%
  ggplot(aes(x = time_value, y = value, col = Signal)) +
  geom_line() +
  facet_wrap(~Signal, scales = 'free') +
  labs(x = 'Date', y = 'Rates (per 100k people)') +
  theme(legend.position = "none")

Notice that some of the COVID death rates are below 0. Let’s impute these values to be 0.

ca$deaths[ca$deaths < 0] <- 0

Overlay cases and deaths on the same plot.

# Linearly map values from one range onto another: x at from_range[1]
# maps to to_range[1], x at from_range[2] maps to to_range[2].
trans <- function(x, from_range, to_range) {
  scaled <- (x - from_range[1]) / (from_range[2] - from_range[1])
  scaled * (to_range[2] - to_range[1]) + to_range[1]
}

# Ranges of the two signals and the linear maps between them,
# used to overlay deaths on the cases axis (and back).
range1 <- range(ca$cases)
range2 <- range(ca$deaths)
trans12 <- function(x) trans(x, range1, range2)
trans21 <- function(x) trans(x, range2, range1)

# Overlay deaths (rescaled into the cases range) on the cases axis, with a
# secondary axis mapping back to the deaths scale.
ggplot(ca %>% 
         mutate(deaths = trans21(deaths)) %>%
         pivot_longer(cols = c(cases, deaths), names_to = 'name'),
       aes(x = time_value, y = value)) + 
  geom_line(aes(color = name)) +
  scale_color_manual(values = palette()[c(2,4)]) +
  scale_y_continuous(
    name = "Reported Covid-19 cases per 100k people", 
    limits = range1,
    sec.axis = sec_axis(
      # `trans` was deprecated in sec_axis() as of ggplot2 3.5.0;
      # `transform` is the current argument name and removes the warning.
      transform = trans12, 
      name = "Reported Covid-19 deaths per 100k people")) +
  labs(title = "Covid-19 cases and deaths in California", x = "Date") +
  theme(legend.position = "bottom", legend.title = element_blank())

Split data into training (before 2021-03-01) and testing set (after 2021-03-01).

# Forecast origin: everything through 2021-03-01 is training data,
# everything after it is the test set.
t0_date <- as.Date('2021-03-01')

train <- filter(ca, time_value <= t0_date)
test <- filter(ca, time_value > t0_date)

# number of training observations (index of the forecast origin)
t0 <- nrow(train)

Choose and create lagged predictor

Check which lag leads to highest correlation between cases and deaths on the training set.

# look at cases and deaths (where we move deaths forward by 1, 2, ..., 35 days)
lags = 1:35
# Correlation between deaths shifted forward by `x` days and cases, computed
# on the training set only, so test data does not influence the lag choice.
cor_deaths_cases <- lapply(lags, 
                             function(x) epi_cor(train, deaths, cases, 
                                                 cor_by = geo_value, dt1 = x))

# The list is unnamed, so `names_to` records list positions; since
# lags = 1:35, positions coincide with the lag values.
cor_deaths_cases <- list_rbind(cor_deaths_cases, names_to = 'Lag') 

# best lag: the shift maximizing the correlation
k <- which.max(cor_deaths_cases$cor)

# Correlation as a function of lag, with a vertical line at the maximizer.
cor_deaths_cases %>%
  ggplot(aes(x = Lag, y = cor)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = k) +
  labs(x = "Lag", y = "Correlation",
       title = 'Correlation between cases and deaths by lag')

We will use lagged COVID cases to predict COVID deaths. COVID cases will be lagged by k = 26 days.

# Shift cases down by k rows to create the lagged predictor, then resplit.
# NOTE(review): dplyr::lag() on the bare column is only valid because the
# data hold a single geo ("ca") sorted by time — confirm before reusing
# this pattern with multiple locations.
ca$lagged_cases <- dplyr::lag(ca$cases, n = k)
train <- ca %>% filter(time_value <= t0_date)
test <- ca %>% filter(time_value > t0_date)

Let’s plot COVID deaths versus lagged COVID cases to check that the relationship is approximately linear.

# Scatter of deaths against lagged cases; linearity check for the regression.
ggplot(train, aes(x = lagged_cases, y = deaths)) +
  geom_point(alpha = 0.5) +
  labs(x = "Lagged cases", y = "Deaths")
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).

Lagged linear regression

Perform linear regression of deaths on lagged cases using training data.

# Fit the lagged linear regression on the training data and show the fit.
reg_lagged <- lm(deaths ~ lagged_cases, data = train)
coef(reg_lagged)
##  (Intercept) lagged_cases 
##   0.09533276   0.01134891

Plot again COVID deaths versus lagged COVID cases with regression line.

# Same scatter as above, overlaid with the fitted regression line.
ggplot(train, aes(x = lagged_cases, y = deaths)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = coef(reg_lagged)[2],
              intercept = coef(reg_lagged)[1],
              col = 'red') +
  labs(x = "Lagged Cases", y = "Deaths",
       title = "Deaths vs cases (lagged by 26 days) with regression line")
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).

Training error

Let’s compute the MSE, MAE, MAPE, and MASE on the training set.

# Mean squared error.
MSE <- function(truth, prediction) {
  err <- truth - prediction
  mean(err^2)
}

# Mean absolute error.
MAE <- function(truth, prediction) {
  err <- truth - prediction
  mean(abs(err))
}

# Mean absolute percentage error, in percent.
# Infinite whenever `truth` contains zeros.
MAPE <- function(truth, prediction) {
  pct_err <- abs(truth - prediction) / truth
  100 * mean(pct_err)
}

# MAE scaled by the naive one-step (random-walk) MAE, in percent.
MASE <- function(truth, prediction) {
  naive_mae <- mean(abs(diff(truth)))
  100 * MAE(truth, prediction) / naive_mae
}
# In-sample predictions. lm() dropped the first k rows (NA lagged_cases),
# so pad k NAs to realign predictions with the full training frame.
pred_train <- predict(reg_lagged)
train$pred_train <- c(rep(NA, k), pred_train)

# Training-set error under each metric, excluding the k rows without
# a prediction.
errors <- data.frame("MSE" = MSE(train$deaths[-(1:k)], pred_train), 
                     "MAE"= MAE(train$deaths[-(1:k)], pred_train), 
                     "MAPE" = MAPE(train$deaths[-(1:k)], pred_train), 
                     "MASE" = MASE(train$deaths[-(1:k)], pred_train), 
                     row.names = "training")
errors
##                  MSE        MAE     MAPE     MASE
## training 0.008094702 0.05294347 17.82704 311.8417

The predictions on the training set track the observed death rates quite closely.

# Observed vs in-sample predicted death rates over the training period.
train %>%
  mutate(observed = deaths, predicted = pred_train) %>%
  pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "Regression with a lagged predictor: training", 
       x = "", y = "Covid-19 deaths per 100k people")
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).

Split-sample error

Let’s compute the MSE, MAE, MAPE, and MASE on the test set. As expected, the errors on the test set are much larger than the errors on the training set.

Notice that the MAPE does not have a finite value because there are some 0s in the denominator.

# Apply the (fixed) training-set fit to the test set and record its errors.
test$pred_test <- predict(reg_lagged, newdata = test)

errors <- rbind(errors, 
                data.frame("MSE" = MSE(test$deaths, test$pred_test), 
                           "MAE"= MAE(test$deaths, test$pred_test), 
                           "MAPE" = MAPE(test$deaths, test$pred_test), 
                           "MASE" = MASE(test$deaths, test$pred_test), 
                           row.names = "split-sample"))
errors
##                      MSE        MAE     MAPE     MASE
## training     0.008094702 0.05294347 17.82704 311.8417
## split-sample 0.016062115 0.10175475      Inf 671.1406

Let’s plot the predictions on the test set. They do not look great: the model under-predicts at the beginning, and then over-predicts for the rest of the time.

# Observed vs predicted death rates over the test period.
test %>%
  mutate(observed = deaths, predicted = pred_test) %>%
  pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "Regression with a lagged predictor: test", 
       x = "", y = "Covid-19 deaths per 100k people")


AFTERNOON

We can also perform split-sample using {epipredict}.

head(test)
## An `epi_df` object, 6 x 6 with metadata:
## * geo_type  = state
## * time_type = day
## * as_of     = 2024-10-01 17:21:57.022087
## 
## # A tibble: 6 × 6
##   geo_value time_value cases deaths lagged_cases pred_test
## * <chr>     <date>     <dbl>  <dbl>        <dbl>     <dbl>
## 1 ca        2021-03-02  12.5  1.06          39.6     0.545
## 2 ca        2021-03-03  11.9  0.767         37.5     0.520
## 3 ca        2021-03-04  11.7  0.733         35.8     0.501
## 4 ca        2021-03-05  11.5  0.700         34.0     0.482
## 5 ca        2021-03-06  11.3  0.745         32.7     0.466
## 6 ca        2021-03-07  11.3  0.694         31.8     0.457
# Same lagged regression via {epipredict}: lags = k-1 combined with
# ahead = 1 gives a total shift of k days, matching the manual
# `lagged_cases` model (the printed coefficient `lag_25_cases` agrees
# with `reg_lagged` above).
epi_reg_lagged <- arx_forecaster(epi_data = train %>% as_epi_df(), 
                                 outcome = "deaths", 
                                 predictors = "cases", 
                                 trainer = linear_reg() %>% set_engine("lm"),
                                 args_list = arx_args_list(lags = k-1, 
                                                           ahead = 1L))

# inspect the fitted lm inside the workflow
extract_fit_parsnip(epi_reg_lagged$epi_workflow)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##  (Intercept)  lag_25_cases  
##      0.09533       0.01135

Time-series cross-validation

We will now perform time-series cross-validation fitting the model on:

  • all the past data up to the forecasting time
  • a trailing window of data up to the forecasting time

We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.

n <- nrow(ca)
# Preallocate 1-step-ahead prediction vectors, one slot per test date.
pred_all_past = pred_trailing <- rep(NA, length = n - t0)
w <- 30 # trailing window size

# For each test date t, refit the lagged regression using only rows that
# precede t, then predict row t (time-series cross-validation).
for (t in (t0+1):n) {
  # fit on every row up to t-1
  reg_all_past = lm(deaths ~ lagged_cases, data = ca, 
                    #subset = (1:n) <= (t-k)) 
                    subset = (1:n) <= (t-1)) 
  # fit on only the w most recent rows, t-w through t-1
  reg_trailing = lm(deaths ~ lagged_cases, data = ca, 
                    #subset = (1:n) <= (t-k) & (1:n) > (t-k-w)) 
                    subset = (1:n) <= (t-1) & (1:n) > (t-1-w)) 
  pred_all_past[t-t0] = predict(reg_all_past, newdata = data.frame(ca[t, ]))
  pred_trailing[t-t0] = predict(reg_trailing, newdata = data.frame(ca[t, ]))
}

test$pred_cv <- pred_all_past
test$pred_trailing_cv <- pred_trailing

We compute the cross-validated error when using all the past data up to the forecasting time. Since we are refitting the model as new data come in, the error (under all metrics) is slightly smaller than when using a split-sample approach, but is still not as small as when we compute it on the training data.

# Errors for the refit-on-all-past cross-validated predictions.
errors <- rbind(errors, 
                data.frame("MSE" = MSE(test$deaths, pred_all_past), 
                           "MAE"= MAE(test$deaths, pred_all_past), 
                           "MAPE" = MAPE(test$deaths, pred_all_past), 
                           "MASE" =  MASE(test$deaths, pred_all_past), 
                           row.names = "time series CV"))
errors
##                        MSE        MAE     MAPE     MASE
## training       0.008094702 0.05294347 17.82704 311.8417
## split-sample   0.016062115 0.10175475      Inf 671.1406
## time series CV 0.014838552 0.09623533      Inf 634.7363

The cross-validated error obtained when using a trailing window is much smaller than all the other errors previously obtained, under all metrics. This can be explained by the fact that the relationship between the predictor and the outcome changes over time, and therefore using a short recent window of data to get linear regression estimates can improve the predictions.

# Errors for the trailing-window cross-validated predictions.
errors <- rbind(errors, 
                data.frame("MSE" = MSE(test$deaths, pred_trailing), 
                           "MAE"= MAE(test$deaths, pred_trailing), 
                           "MAPE" = MAPE(test$deaths, pred_trailing), 
                           "MASE" = MASE(test$deaths, pred_trailing), 
                           row.names = "time series CV + trailing"))
errors
##                                   MSE        MAE     MAPE     MASE
## training                  0.008094702 0.05294347 17.82704 311.8417
## split-sample              0.016062115 0.10175475      Inf 671.1406
## time series CV            0.014838552 0.09623533      Inf 634.7363
## time series CV + trailing 0.004064001 0.04507775      Inf 297.3179

We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows. The latter tracks the observed deaths quite well.

# Observed deaths against both flavors of cross-validated predictions.
test %>%
  mutate(observed = deaths,
         `predicted (CV)` = pred_cv,
         `predicted (trailing + CV)` = pred_trailing_cv) %>%
  pivot_longer(cols = c(observed, `predicted (CV)`, `predicted (trailing + CV)`),
               names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "Regression with a lagged predictor: time-series cross-validation", 
       x = "", y = "Covid-19 deaths per 100k people")


AFTERNOON

The predictions we obtained by manually lagging cases and using lm within for loops can be equivalently obtained via {epipredict}. The latter gives a more user-friendly and straightforward way to obtain the predictions.

The approach with trailing window is reported next.

# Daily forecast reference dates spanning the test period.
fc_time_values <- seq(from = as.Date("2021-03-01"),
                      to = as.Date("2021-12-31"),
                      by = "1 day")

# slide an arx_forecaster with appropriate outcome, predictions, lags, 
# and trailing window; each slide window refits the model and returns its
# 1-step-ahead prediction for the reference date.
epi_pred_cv_trailing <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = "cases", 
                   trainer = linear_reg() %>% set_engine("lm"),
                   args_list = arx_args_list(lags = k-1, ahead = 1L)
                   )$predictions,
  # notice that `before` is not simply equal to w-1. That's because previously, 
  # when considering a window from t to t+w, we had access to y_t, ..., y_{t+w} 
  # and also to x_{t-k}, ..., x_{t+w-k}. (That's because of how we structured 
  # the dataframe after manually lagging x.) So we were cheating by saying that 
  # the trailing window had length w, as its actual size was w+k! 
  before = (w+k-1), 
  ref_time_values = fc_time_values,
  new_col_name = "fc"
)

# they match exactly
head(epi_pred_cv_trailing %>% select(fc_.pred, fc_target_date))
## # A tibble: 6 × 2
##   fc_.pred fc_target_date
##      <dbl> <date>        
## 1    0.934 2021-03-02    
## 2    0.940 2021-03-03    
## 3    0.919 2021-03-04    
## 4    0.895 2021-03-05    
## 5    0.873 2021-03-06    
## 6    0.861 2021-03-07
head(test %>% select(pred_trailing_cv, time_value))
## # A tibble: 6 × 2
##   pred_trailing_cv time_value
##              <dbl> <date>    
## 1            0.934 2021-03-02
## 2            0.940 2021-03-03
## 3            0.919 2021-03-04
## 4            0.895 2021-03-05
## 5            0.873 2021-03-06
## 6            0.861 2021-03-07

The method fitting on all past data up to the forecasting date can be implemented by setting before = Inf in epi_slide.

# slide an arx_forecaster with appropriate outcome, predictions and lags;
# `before = Inf` expands each training window to all data up to the
# reference date, mirroring the manual "all past" loop.
epi_pred_cv <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = "cases", 
                   trainer = linear_reg() %>% set_engine("lm"),
                   args_list = arx_args_list(lags = k-1, ahead = 1L)
                   )$predictions,
  before = Inf, 
  ref_time_values = fc_time_values,
  new_col_name = "fc"
)

# they match exactly
head(epi_pred_cv %>% select(fc_.pred, fc_target_date))
## # A tibble: 6 × 2
##   fc_.pred fc_target_date
##      <dbl> <date>        
## 1    0.545 2021-03-02    
## 2    0.522 2021-03-03    
## 3    0.504 2021-03-04    
## 4    0.485 2021-03-05    
## 5    0.470 2021-03-06    
## 6    0.461 2021-03-07
head(test %>% select(pred_cv, time_value))
## # A tibble: 6 × 2
##   pred_cv time_value
##     <dbl> <date>    
## 1   0.545 2021-03-02
## 2   0.522 2021-03-03
## 3   0.504 2021-03-04
## 4   0.485 2021-03-05
## 5   0.470 2021-03-06
## 6   0.461 2021-03-07

ISSUE

If we modify the algorithms to predict 7-step ahead, the manual and {epipredict} implementations do not match.

n <- nrow(ca)
pred_all_past <- rep(NA, length = n - t0)
# forecast horizon (days ahead)
n_ahead <- 7

# 7-step-ahead CV: train only on rows up to t - n_ahead so no data newer
# than the forecast origin leaks into the fit.
for (t in (t0+1):n) {
    reg_all_past = lm(deaths ~ lagged_cases, data = ca, 
                      subset = (1:n) <= (t-n_ahead)) 
    pred_all_past[t-t0] = predict(reg_all_past, newdata = data.frame(ca[t, ]))
}

test$pred_cv_7 <- pred_all_past
    

# Reference dates shifted back by 7 days so that the 7-day-ahead target
# dates line up with the test period.
fc_time_values_7 <- seq(from = as.Date("2021-02-23"),
                        to = as.Date("2021-12-25"),
                        by = "1 day")

# 7-step-ahead {epipredict} counterpart of the manual loop above.
epi_pred_cv_7 <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = "cases", 
                   trainer = linear_reg() %>% set_engine("lm"),
                   args_list = arx_args_list(lags = k-1, ahead = n_ahead)
                   )$predictions,
  before = Inf, 
  ref_time_values = fc_time_values_7,
  new_col_name = "fc"
)

# they do not match
head(epi_pred_cv_7 %>% select(fc_.pred, fc_target_date))
## # A tibble: 6 × 2
##   fc_.pred fc_target_date
##      <dbl> <date>        
## 1    0.676 2021-03-02    
## 2    0.660 2021-03-03    
## 3    0.624 2021-03-04    
## 4    0.611 2021-03-05    
## 5    0.587 2021-03-06    
## 6    0.573 2021-03-07
head(test %>% select(pred_cv_7, time_value))
## # A tibble: 6 × 2
##   pred_cv_7 time_value
##       <dbl> <date>    
## 1     0.533 2021-03-02
## 2     0.511 2021-03-03
## 3     0.494 2021-03-04
## 4     0.476 2021-03-05
## 5     0.463 2021-03-06
## 6     0.455 2021-03-07
# `plot()` and `lines()` are base-graphics calls used for their side effects;
# chaining them with `+` added their empty return values, which is what
# printed `integer(0)` here. Call them as two separate statements instead.
# NOTE(review): this plots the 1-step-ahead forecasts (`epi_pred_cv`) against
# the manual 7-step-ahead predictions; `epi_pred_cv_7` may have been the
# intended object — confirm.
plot(epi_pred_cv$fc_target_date, epi_pred_cv$fc_.pred, type = 'l')
lines(test$time_value, test$pred_cv_7, col = 'red')

AR model

We now consider a different model. We disregard cases, and only use past deaths to predict future deaths. Let’s check which lag leads to largest correlation between deaths and lagged deaths.

# Autocorrelation of deaths at lags 1..8, computed on the training set.
lags <- 1:8
auto_cor_deaths <- lapply(lags, 
                          function(x) epi_cor(train, deaths, deaths, 
                                              cor_by = geo_value, dt1 = x))

# list positions (1..8) become the `Lag` column, matching the lag values
auto_cor_deaths <- list_rbind(auto_cor_deaths, names_to = 'Lag') 

# Autocorrelation of deaths as a function of lag.
auto_cor_deaths %>%
  ggplot(aes(x = Lag, y = cor)) +
  geom_point() +
  geom_line() +
  labs(x = "Lag", y = "Correlation",
       title = 'Auto-correlation for deaths by lag')

We will consider an AR model where we predict deaths using lagged_deaths which lags the former by 1. That is

\(y_t \approx \beta_0 + \beta_1 y_{t-1}\)

# AR(1) design: deaths shifted down by one day, then resplit.
# NOTE(review): as with lagged_cases, the bare-column lag is valid only
# because there is a single geo sorted by time — confirm before reuse.
ca$lagged_deaths <- dplyr::lag(ca$deaths, n = 1)
train_ar <- ca %>% filter(time_value <= t0_date)
test_ar <- ca %>% filter(time_value > t0_date)

Let’s plot COVID deaths versus lagged COVID deaths to check that the relationship is approximately linear.

# Scatter of deaths against 1-day-lagged deaths; linearity check.
ggplot(train_ar, aes(x = lagged_deaths, y = deaths)) +
  geom_point(alpha = 0.5) +
  labs(x = "Lagged deaths", y = "Deaths")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Perform linear regression of deaths on lagged deaths using training data.

Note: the intercept is approximately 0 and the coefficient is approximately 1. This means that we are naively predicting the number of deaths tomorrow with the number of deaths observed today.

# Fit the AR(1) regression on the training data and show the coefficients.
ar_fit <- lm(deaths ~ lagged_deaths, data = train_ar)
coef(ar_fit)
##   (Intercept) lagged_deaths 
##   0.002515034   1.001278147

Plot again COVID deaths versus lagged COVID cases with regression line.

# Same scatter, overlaid with the fitted AR(1) regression line.
ggplot(train_ar, aes(x = lagged_deaths, y = deaths)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = coef(ar_fit)[2],
              intercept = coef(ar_fit)[1],
              col = 'red') +
  labs(x = "Lagged deaths", y = "Deaths",
       title = "Deaths vs lagged deaths with regression line")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Training error

Let’s compute the MSE, MAE, MAPE, and MASE on the training set.

# In-sample AR predictions. lm() dropped the single row with NA
# lagged_deaths, so pad one NA to realign with train_ar.
pred_train <- predict(ar_fit)

train_ar$pred_train <- c(NA, pred_train)

# Training errors, excluding the first row (no prediction available).
errors_ar <- data.frame("MSE" = MSE(train_ar$deaths[-1], pred_train), 
                        "MAE"= MAE(train_ar$deaths[-1], pred_train), 
                        "MAPE" = MAPE(train_ar$deaths[-1], pred_train), 
                        "MASE" = MASE(train_ar$deaths[-1], pred_train), 
                        row.names = "training")
errors_ar
##                   MSE        MAE     MAPE     MASE
## training 0.0007915624 0.01630727 4.737325 100.1796
# Observed vs in-sample AR predictions over the training period.
train_ar %>%
  mutate(observed = deaths, predicted = pred_train) %>%
  pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "AR: training", 
       x = "", y = "Covid-19 deaths per 100k people")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

Split-sample error

Let’s compute the MSE, MAE, MAPE, and MASE on the test set. Here, the test errors are very similar to the training errors.

Notice that the MAPE does not have a finite value because there are some 0s in the denominator.

# Apply the fixed AR(1) training fit to the test set and record its errors.
test_ar$pred_test <- predict(ar_fit, newdata = test_ar)

errors_ar <- rbind(errors_ar, 
                  data.frame("MSE" = MSE(test_ar$deaths, test_ar$pred_test), 
                             "MAE"= MAE(test_ar$deaths, test_ar$pred_test), 
                             "MAPE" = MAPE(test_ar$deaths, test_ar$pred_test), 
                             "MASE" = MASE(test_ar$deaths, test_ar$pred_test), 
                             row.names = "split-sample"))
errors_ar
##                       MSE        MAE     MAPE     MASE
## training     0.0007915624 0.01630727 4.737325 100.1796
## split-sample 0.0008281702 0.01571932      Inf 103.6794

Let’s plot the predictions on the test set.

# Observed vs AR-predicted death rates over the test period.
test_ar %>%
  mutate(observed = deaths, predicted = pred_test) %>%
  pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "AR: test", 
       x = "", y = "Covid-19 deaths per 100k people")

Time-series cross-validation

We will now perform time-series cross-validation fitting the model on:

  • all the past data up to the forecasting time
  • a trailing window of data up to the forecasting time

We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.

n <- nrow(ca)
pred_all_past = pred_trailing <- rep(NA, length = n - t0)
w <- 30 # trailing window size

# Time-series CV for the AR(1) model: refit at each test date on data
# preceding it (all past, or the last w rows), then predict one day ahead.
for (t in (t0+1):n) {
  ar_all_past = lm(deaths ~ lagged_deaths, data = ca, 
                    subset = (1:n) <= (t-1)) 
  ar_trailing = lm(deaths ~ lagged_deaths, data = ca, 
                    subset = (1:n) <= (t-1) & (1:n) > (t-1-w)) 
  pred_all_past[t-t0] = predict(ar_all_past, newdata = data.frame(ca[t, ]))
  pred_trailing[t-t0] = predict(ar_trailing, newdata = data.frame(ca[t, ]))
}

test_ar$pred_cv <- pred_all_past
test_ar$pred_trailing_cv <- pred_trailing

We compute the cross-validated error when using all the past data up to the forecasting time. Since we are refitting the model as new data come in, the error (under all metrics) is slightly smaller than when using a split-sample approach, but is still not as small as when we compute it on the training data.

# Errors for the AR refit-on-all-past cross-validated predictions.
errors_ar <- rbind(errors_ar, 
                data.frame("MSE" = MSE(test_ar$deaths, pred_all_past), 
                           "MAE"= MAE(test_ar$deaths, pred_all_past), 
                           "MAPE" = MAPE(test_ar$deaths, pred_all_past), 
                           "MASE" =  MASE(test_ar$deaths, pred_all_past), 
                           row.names = "time series CV"))
errors_ar
##                         MSE        MAE     MAPE     MASE
## training       0.0007915624 0.01630727 4.737325 100.1796
## split-sample   0.0008281702 0.01571932      Inf 103.6794
## time series CV 0.0008103096 0.01531685      Inf 101.0248

The cross-validated error obtained when using a trailing window is the largest under all metrics. This can be explained by the fact that the relationship between deaths at two contiguous time points is quite stable over time. Therefore considering a limited window of data does not improve the fit, and leads instead to a fit that is slightly too volatile.

# Errors for the AR trailing-window cross-validated predictions.
errors_ar <- rbind(errors_ar, 
                data.frame("MSE" = MSE(test_ar$deaths, pred_trailing), 
                           "MAE"= MAE(test_ar$deaths, pred_trailing), 
                           "MAPE" = MAPE(test_ar$deaths, pred_trailing), 
                           "MASE" = MASE(test_ar$deaths, pred_trailing), 
                           row.names = "time series CV + trailing"))
errors_ar
##                                    MSE        MAE     MAPE     MASE
## training                  0.0007915624 0.01630727 4.737325 100.1796
## split-sample              0.0008281702 0.01571932      Inf 103.6794
## time series CV            0.0008103096 0.01531685      Inf 101.0248
## time series CV + trailing 0.0008502810 0.01682578      Inf 110.9773

We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows. They are almost always overlapping.

# Observed deaths against both AR cross-validated prediction variants.
test_ar %>%
  mutate(observed = deaths,
         `predicted (CV)` = pred_cv,
         `predicted (trailing + CV)` = pred_trailing_cv) %>%
  pivot_longer(cols = c(observed, `predicted (CV)`, `predicted (trailing + CV)`),
               names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "AR: time-series cross-validation", 
       x = "", y = "Covid-19 deaths per 100k people")


AFTERNOON

# AR(1) fit on all past data via {epipredict}: lags = 0L with ahead = 1L
# regresses deaths at t+1 on deaths at t, matching the manual AR model.
ar_all_past <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = "deaths", 
                   trainer = linear_reg() %>% set_engine("lm"),
                   args_list = arx_args_list(lags = 0L, ahead = 1L)
                   )$predictions,
  before = Inf, 
  ref_time_values = fc_time_values,
  new_col_name = "all_past"
) 

# Same model on a trailing window. NOTE(review): `before = w` (not w-1) —
# presumably to account for the 1-step-ahead shift inside arx_forecaster;
# the equality checks further below compare this against the manual loop.
ar_trailing <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = "deaths", 
                   trainer = linear_reg() %>% set_engine("lm"),
                   args_list = arx_args_list(lags = 0L, ahead = 1L)
                   )$predictions,
  before = w, 
  ref_time_values = fc_time_values,
  new_col_name = "trailing"
)
# Merge the two epipredict runs and plot observed vs both predictions.
ar_trailing %>%
  left_join(ar_all_past, join_by(time_value, geo_value, cases, deaths)) %>%
  mutate(observed = deaths,
         `predicted (all past)` = all_past_.pred,
         `predicted (trailing)` = trailing_.pred) %>%
  pivot_longer(cols = c(observed, `predicted (all past)`, `predicted (trailing)`),
               names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "Observed and predicted Covid-19 deaths on test set (California)", 
       x = "", y = "Covid-19 deaths per 100k people")

# Check that manual implementation and epipredict give same results.
# Fixes: compare floating-point predictions with a tolerance rather than
# exact `==` (the two code paths need not agree bitwise), and use the
# scalar operator `||` inside the scalar `if` conditions.

if(!all(test_ar$time_value == ar_all_past$all_past_target_date) || 
   !all(test_ar$time_value == ar_trailing$trailing_target_date)) {
  print("Error: target dates do not match!")
}

if(!isTRUE(all.equal(test_ar$pred_cv, ar_all_past$all_past_.pred)) || 
   !isTRUE(all.equal(test_ar$pred_trailing_cv, ar_trailing$trailing_.pred))) {
  print("Error: predictions do not match!")
}

ARX model

To improve our predictive performance, we could put together the two models considered so far (i.e. linear regression on cases lagged by k = 26, and linear regression on deaths lagged by 1). This will lead us to the following ARX model

\(y_t \approx \beta_0 + \beta_1 y_{t-1} + \beta_2 x_{t-k}\)

# ARX design: `ca` already carries both lagged_deaths and lagged_cases,
# so only the train/test split needs refreshing.
train_arx <- ca %>% filter(time_value <= t0_date)
test_arx <- ca %>% filter(time_value > t0_date)

The estimated coefficients for the ARX model are

# Fit the ARX model: deaths on 1-day-lagged deaths plus k-day-lagged cases.
arx_fit <- lm(deaths ~ lagged_deaths + lagged_cases, data = train_arx)
coef(arx_fit)
##   (Intercept) lagged_deaths  lagged_cases 
##  0.0037535320  0.9810095601  0.0002469175

Training error

Let’s compute the MSE, MAE, MAPE, and MASE on the training set.

# In-sample ARX predictions. lm() dropped the first k rows (NA in
# lagged_cases dominates the single NA in lagged_deaths), so pad k NAs.
pred_train <- predict(arx_fit)

train_arx$pred_train <- c(rep(NA, k), pred_train)

# Training errors, excluding the k rows without a prediction.
errors_arx <- data.frame("MSE" = MSE(train_arx$deaths[-(1:k)], pred_train), 
                        "MAE"= MAE(train_arx$deaths[-(1:k)], pred_train), 
                        "MAPE" = MAPE(train_arx$deaths[-(1:k)], pred_train), 
                        "MASE" = MASE(train_arx$deaths[-(1:k)], pred_train), 
                        row.names = "training")
errors_arx
##                   MSE        MAE     MAPE     MASE
## training 0.0008454054 0.01704495 4.613866 100.3963
# Observed vs in-sample ARX predictions over the training period.
train_arx %>%
  mutate(observed = deaths, predicted = pred_train) %>%
  pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "ARX: training", 
       x = "", y = "Covid-19 deaths per 100k people")
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).

Split-sample error

Let’s compute the MSE, MAE, MAPE, and MASE on the test set. Here, the test errors are very similar to the training errors.

Notice that the MAPE does not have a finite value because there are some 0s in the denominator.

# Apply the fixed ARX training fit to the test set and record its errors.
test_arx$pred_test <- predict(arx_fit, newdata = test_arx)

errors_arx <- rbind(errors_arx, 
                  data.frame("MSE" = MSE(test_arx$deaths, test_arx$pred_test), 
                             "MAE"= MAE(test_arx$deaths, test_arx$pred_test), 
                             "MAPE" = MAPE(test_arx$deaths, test_arx$pred_test), 
                             "MASE" = MASE(test_arx$deaths, test_arx$pred_test), 
                             row.names = "split-sample"))
errors_arx
##                       MSE        MAE     MAPE     MASE
## training     0.0008454054 0.01704495 4.613866 100.3963
## split-sample 0.0007874735 0.01560882      Inf 102.9506

Let’s plot the predictions on the test set.

# Observed vs ARX-predicted death rates over the test period.
test_arx %>%
  mutate(observed = deaths, predicted = pred_test) %>%
  pivot_longer(cols = c(predicted, observed), names_to = 'Deaths') %>%
  ggplot(aes(x = time_value, y = value, col = Deaths)) +
  geom_line() +
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(title = "ARX: test", 
       x = "", y = "Covid-19 deaths per 100k people")

Time-series cross-validation

We will now perform time-series cross-validation fitting the model on:

  • all the past data up to the forecasting time
  • a trailing window of data up to the forecasting time

We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.

# Time-series cross-validation: at each forecast date t refit the ARX model,
# once on all past data and once on a trailing window of w observations,
# then make a 1-step-ahead prediction for time t.
# Fixes vs original: `rep(NA, length = ...)` relied on partial argument
# matching (length -> length.out); `=` used for assignment inside the loop;
# repeated `(1:n)` replaced by a precomputed seq_len(n) index.
n <- nrow(ca)
pred_all_past <- rep(NA, n - t0)
pred_trailing <- rep(NA, n - t0)
w <- 30 # trailing window size
idx <- seq_len(n) # row indices of ca, reused in the subset masks below

for (t in (t0 + 1):n) {
  # Model trained on every observation strictly before time t
  arx_all_past <- lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
                     subset = idx <= (t - 1))
  # Model trained only on the w most recent observations before time t
  arx_trailing <- lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
                     subset = idx <= (t - 1) & idx > (t - 1 - w))
  pred_all_past[t - t0] <- predict(arx_all_past, newdata = data.frame(ca[t, ]))
  pred_trailing[t - t0] <- predict(arx_trailing, newdata = data.frame(ca[t, ]))
}

test_arx$pred_cv <- pred_all_past
test_arx$pred_trailing_cv <- pred_trailing

We compute the cross-validated error when using all the past data up to the forecasting time. Since we are refitting the model as new data come in, the MSE is slightly smaller than under the split-sample approach, while the MAE and MASE are essentially unchanged (see the table below).

# Append the time-series CV (all past data) error metrics to the table
errors_arx <- rbind(
  errors_arx,
  data.frame(
    MSE  = MSE(test_arx$deaths, pred_all_past),
    MAE  = MAE(test_arx$deaths, pred_all_past),
    MAPE = MAPE(test_arx$deaths, pred_all_past),
    MASE = MASE(test_arx$deaths, pred_all_past),
    row.names = "time series CV"
  )
)
errors_arx
##                         MSE        MAE     MAPE     MASE
## training       0.0008454054 0.01704495 4.613866 100.3963
## split-sample   0.0007874735 0.01560882      Inf 102.9506
## time series CV 0.0007658799 0.01561658      Inf 103.0018

The cross-validated error obtained when using a trailing window is again the largest under all metrics.

# Append the time-series CV (trailing window) error metrics to the table
errors_arx <- rbind(
  errors_arx,
  data.frame(
    MSE  = MSE(test_arx$deaths, pred_trailing),
    MAE  = MAE(test_arx$deaths, pred_trailing),
    MAPE = MAPE(test_arx$deaths, pred_trailing),
    MASE = MASE(test_arx$deaths, pred_trailing),
    row.names = "time series CV + trailing"
  )
)
errors_arx
##                                    MSE        MAE     MAPE     MASE
## training                  0.0008454054 0.01704495 4.613866 100.3963
## split-sample              0.0007874735 0.01560882      Inf 102.9506
## time series CV            0.0007658799 0.01561658      Inf 103.0018
## time series CV + trailing 0.0009794230 0.01816857      Inf 119.8339

We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows.

# Plot observed deaths against the 1-step-ahead predictions from both
# CV schemes (all past data vs trailing window)
test_arx %>%
  mutate(observed = deaths,
         `predicted (CV)` = pred_cv,
         `predicted (trailing + CV)` = pred_trailing_cv) %>%
  pivot_longer(
    cols = c(observed, `predicted (CV)`, `predicted (trailing + CV)`),
    names_to = "Deaths"
  ) %>%
  ggplot(aes(x = time_value, y = value, colour = Deaths)) +
  geom_line() +
  labs(
    title = "ARX: time-series cross-validation",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())


AFTERNOON

# epipredict equivalent of the manual "all past data" CV loop: at each
# reference date in fc_time_values, epi_slide hands the available window of
# ca to arx_forecaster, which fits an ARX model and returns a 1-step-ahead
# prediction (stored in column all_past_.pred).
arx_all_past <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = c("deaths", "cases"), 
                   trainer = linear_reg() %>% set_engine("lm"),
                   # lags 0 and k-1 of each predictor; forecast 1 day ahead
                   args_list = arx_args_list(lags = list(0, k-1), 
                                             ahead = 1L)
                   )$predictions,
  before = Inf,  # train on all data up to the reference date
  ref_time_values = fc_time_values,
  new_col_name = "all_past"
) 

# Trailing-window variant of the epi_slide forecaster above: same ARX model,
# but each refit sees only a bounded window of recent data.
arx_trailing <- epi_slide(
  ca, 
  ~ arx_forecaster(epi_data = .x,
                   outcome = "deaths", 
                   predictors = c("deaths", "cases"), 
                   trainer = linear_reg() %>% set_engine("lm"),
                   args_list = arx_args_list(lags = list(0, k-1), 
                                             ahead = 1L)
                   )$predictions,
  # window of w+k-1 rows — presumably so that w complete rows remain after
  # constructing the k-1 lags; TODO confirm against the manual loop above
  before = (w+k-1), 
  ref_time_values = fc_time_values,
  new_col_name = "trailing"
)
# Merge the two epi_slide outputs on their shared columns and plot observed
# deaths against both prediction series
arx_trailing %>%
  left_join(arx_all_past,
            by = c("time_value", "geo_value", "cases", "deaths")) %>%
  mutate(observed = deaths,
         `predicted (all past)` = all_past_.pred,
         `predicted (trailing)` = trailing_.pred) %>%
  pivot_longer(
    cols = c(observed, `predicted (all past)`, `predicted (trailing)`),
    names_to = "Deaths"
  ) %>%
  ggplot(aes(x = time_value, y = value, colour = Deaths)) +
  geom_line() +
  labs(
    title = "Observed and predicted Covid-19 deaths on test set (California)",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())

# Check that manual implementation and epipredict give same results

# Sanity checks: manual CV implementation vs epipredict output.
# Each all() is wrapped in isTRUE() so that an NA produced by the
# element-wise comparison cannot crash the `if` (all() returns NA when any
# comparison is NA); the scalar conditions use || rather than vectorized |.
# NOTE(review): exact `==` on doubles is deliberately strict here — the point
# is to detect any numeric discrepancy; use isTRUE(all.equal(...)) instead
# if tolerance-based agreement is acceptable.
if (!isTRUE(all(test_arx$time_value == arx_all_past$all_past_target_date)) ||
    !isTRUE(all(test_arx$time_value == arx_trailing$trailing_target_date))) {
  print("Error: target dates do not match!")
}

if (!isTRUE(all(test_arx$pred_cv == arx_all_past$all_past_.pred))) {
  print("Error: all past predictions do not match!")
}

if (!isTRUE(all(test_arx$pred_trailing_cv == arx_trailing$trailing_.pred))) {
  print("Error: trailing predictions do not match!")
}
## [1] "Error: trailing predictions do not match!"
# Locate the positions where the manual and epipredict trailing predictions
# disagree. The table below shows every mismatch pairs a small negative
# manual prediction with an epipredict 0 — presumably epipredict clamps
# forecasts at zero via a non-negativity post-processing step; TODO confirm
# against arx_args_list defaults.
different <- (test_arx$pred_trailing_cv != arx_trailing$trailing_.pred)

cbind("manual" = test_arx$pred_trailing_cv,
      "epipredict" = arx_trailing$trailing_.pred)[different, ]
##              manual epipredict
##  [1,] -0.0037960635          0
##  [2,] -0.0002207857          0
##  [3,] -0.0087941934          0
##  [4,] -0.0085684000          0
##  [5,] -0.0258439318          0
##  [6,] -0.0128287550          0
##  [7,] -0.0116267034          0
##  [8,] -0.0101503605          0
##  [9,] -0.0094682107          0
## [10,] -0.0091205887          0
## [11,] -0.0106511890          0

Prediction Intervals

So far we focused on point predictions, i.e. we provided one “best” value as our guess for the number of deaths. We will now introduce 95% prediction intervals: each forecast will be accompanied by an interval (i.e. a range of values) that we expect to cover the true number of deaths about 95% of the time.

To get prediction intervals from the previous code, we only need to tweak our call to predict by adding as an input: interval = "prediction", level = 0.95. The output from predict will then be a matrix with first column a point estimate, second column the lower limit of the interval, and third column the upper limit of the interval.

Next, we plot the point predictions on the training and test set together with the prediction intervals (shadowed bands).

# 95% prediction intervals on the test set; predict() returns a matrix
# with columns fit (point estimate), lwr and upr (interval limits)
pred_test_ci <- predict(
  arx_fit,
  newdata = test_arx,
  level = 0.95,
  interval = "prediction"
)

head(pred_test_ci)
##         fit       lwr       upr
## 1 1.0712145 1.0101736 1.1322555
## 2 1.0483291 0.9872675 1.1093908
## 3 0.7648747 0.7063989 0.8233506
## 4 0.7314587 0.6730736 0.7898439
## 5 0.6984966 0.6402211 0.7567721
## 6 0.7422651 0.6836403 0.8008899
# Plot point predictions with their 95% prediction band on the test set
test_arx %>%
  mutate(observed = deaths,
         predicted = pred_test_ci[, "fit"],
         lower = pred_test_ci[, "lwr"],
         upper = pred_test_ci[, "upr"]) %>%
  pivot_longer(
    cols = c(predicted, observed),
    names_to = "Deaths"
  ) %>%
  ggplot(aes(x = time_value, y = value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
  geom_line(aes(colour = Deaths)) +
  labs(
    title = "ARX: point predictions and intervals on training and test sets",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())

We can also provide prediction intervals for the cross-validated approach.

Notice that the width of the prediction intervals varies substantially for the trailing window method.

# Time-series CV with 95% prediction intervals: each prediction is now a
# 3-vector (fit, lwr, upr), stored as one row of the result matrices.
# Fixes vs original: `=` assignment inside the loop replaced by `<-`;
# repeated `(1:n)` replaced by a precomputed seq_len(n) index.
n <- nrow(ca)
pred_all_past <- matrix(NA, nrow = n - t0, ncol = 3)
pred_trailing <- matrix(NA, nrow = n - t0, ncol = 3)
w <- 30 # trailing window size
idx <- seq_len(n) # row indices of ca, reused in the subset masks below

for (t in (t0 + 1):n) {
  arx_all_past <- lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
                     subset = idx <= (t - 1))
  arx_trailing <- lm(deaths ~ lagged_deaths + lagged_cases, data = ca,
                     subset = idx <= (t - 1) & idx > (t - 1 - w))
  pred_all_past[t - t0, ] <- predict(arx_all_past, newdata = data.frame(ca[t, ]),
                                     interval = "prediction", level = 0.95)
  pred_trailing[t - t0, ] <- predict(arx_trailing, newdata = data.frame(ca[t, ]),
                                     interval = "prediction", level = 0.95)
}

# Plot the cross-validated (all past data) point predictions with their
# 95% prediction band
test_arx %>%
  mutate(observed = deaths,
         `predicted (CV)` = pred_all_past[, 1],
         lower = pred_all_past[, 2],
         upper = pred_all_past[, 3]) %>%
  pivot_longer(
    cols = c(observed, `predicted (CV)`),
    names_to = "Deaths"
  ) %>%
  ggplot(aes(x = time_value, y = value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
  geom_line(aes(colour = Deaths)) +
  labs(
    title = "ARX: point predictions and intervals (time-series CV)",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())

# Plot the cross-validated (trailing window) point predictions with their
# 95% prediction band.
# Fix: the title was copy-pasted from the previous plot and did not mention
# the trailing window, making the two figures indistinguishable.
test_arx %>%
  mutate(observed = deaths,
         `predicted (CV + trailing)` = pred_trailing[, 1],
         lower = pred_trailing[, 2],
         upper = pred_trailing[, 3]) %>%
  pivot_longer(cols = c(observed, `predicted (CV + trailing)`),
               names_to = 'Deaths') %>%
  ggplot(aes(time_value, value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
  geom_line(aes(col = Deaths)) +
  labs(title = "ARX: point predictions and intervals (time-series CV + trailing)",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())

Quantile Regression

# Load {quantreg} for quantile regression (rq); install once if missing
#install.packages("quantreg")
library(quantreg)
## Warning: package 'quantreg' was built under R version 4.3.3
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 4.3.3
# Same train/test split as for the ARX model, now fitting a quantile
# regression at the 2.5%, 50% and 97.5% conditional quantiles of deaths.
# Fix: `=` used for top-level assignment replaced by `<-`.
train_rq <- ca %>% filter(time_value <= t0_date)
test_rq <- ca %>% filter(time_value > t0_date)
q_reg <- rq(deaths ~ lagged_deaths + lagged_cases, data = train_rq,
            tau = c(.025, .5, .975))
coef(q_reg)
##                  tau= 0.025   tau= 0.500  tau= 0.975
## (Intercept)   -0.0090324978 0.0032234550 0.004291003
## lagged_deaths  0.9190610331 0.9812695258 1.082715062
## lagged_cases   0.0002464703 0.0001356316 0.000528872

Train/test predictions

Let’s plot the predictions (point estimates and 95% prediction intervals, formed from the fitted 2.5% and 97.5% conditional quantiles) for the training and test sets.

# In-sample predictions, padded with k rows of NA at the top (the lagged
# predictors are unavailable for the first k dates)
pred_train <- rbind(
  matrix(NA, nrow = k, ncol = 3),
  predict(q_reg)
)
# Out-of-sample predictions on the test period
pred_test <- predict(q_reg, newdata = test_rq)

# One matrix covering the full time range: columns are the fitted
# 2.5%, 50% and 97.5% quantiles
preds <- rbind(pred_train, pred_test)

# Plot the median prediction with the 2.5%-97.5% quantile band over both
# periods; the dashed vertical line marks the train/test boundary
ca %>%
  mutate(observed = deaths,
         predicted = preds[, 2],
         lower = preds[, 1],
         upper = preds[, 3]) %>%
  pivot_longer(
    cols = c(predicted, observed),
    names_to = "Deaths"
  ) %>%
  ggplot(aes(x = time_value, y = value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4, fill = "#00BFC4") +
  geom_line(aes(colour = Deaths)) +
  geom_vline(xintercept = t0_date, lty = 2) +
  labs(
    title = "Quantile Regression: point predictions and intervals (train/test)",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).

Time-series cross-validation

We will now perform time-series cross-validation fitting the model on:

  • all the past data up to the forecasting time
  • a trailing window of data up to the forecasting time

We set the trailing window to have size = 30, and we perform 1-step-ahead predictions.

# Time-series CV for quantile regression: refit rq at each forecast date on
# all past data and on a trailing window, then predict the three quantiles
# (lwr / median / upr) one step ahead.
# Fixes vs original: `=` assignment inside the loop replaced by `<-`;
# repeated `(1:n)` replaced by a precomputed seq_len(n) index.
n <- nrow(ca)
pred_all_past <- matrix(NA, nrow = n - t0, ncol = 3)
pred_trailing <- matrix(NA, nrow = n - t0, ncol = 3)
w <- 30 # trailing window size
idx <- seq_len(n) # row indices of ca, reused in the subset masks below

for (t in (t0 + 1):n) {
  rq_all_past <- rq(deaths ~ lagged_deaths + lagged_cases, tau = c(.025, .5, .975),
                    data = ca, subset = idx <= (t - 1))
  rq_trailing <- rq(deaths ~ lagged_deaths + lagged_cases, tau = c(.025, .5, .975),
                    data = ca, subset = idx <= (t - 1) & idx > (t - 1 - w))
  pred_all_past[t - t0, ] <- predict(rq_all_past, newdata = data.frame(ca[t, ]))
  pred_trailing[t - t0, ] <- predict(rq_trailing, newdata = data.frame(ca[t, ]))
}

We plot the 1-step-ahead predictions obtained by the model fitted on all past data and on trailing windows.

# Plot the cross-validated (all past data) median predictions with the
# 2.5%-97.5% quantile band
test_rq %>%
  mutate(observed = deaths,
         `predicted (CV)` = pred_all_past[, 2],
         lower = pred_all_past[, 1],
         upper = pred_all_past[, 3]) %>%
  pivot_longer(
    cols = c(`predicted (CV)`, observed),
    names_to = "Deaths"
  ) %>%
  ggplot(aes(x = time_value, y = value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4,
              fill = hue_pal()(2)[2]) +
  geom_line(aes(colour = Deaths)) +
  geom_vline(xintercept = t0_date, lty = 2) +
  labs(
    title = "Quantile Regression: point predictions and intervals (time-series CV)",
    x = "",
    y = "Covid-19 deaths per 100k people"
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())

# Plot the cross-validated (trailing window) median predictions with the
# 2.5%-97.5% quantile band.
# Fix: the title was copy-pasted from the previous plot and did not mention
# the trailing window; also normalized the stray extra indentation on the
# lower/upper arguments.
test_rq %>%
  mutate(observed = deaths,
         `predicted (trailing + CV)` = pred_trailing[, 2],
         lower = pred_trailing[, 1],
         upper = pred_trailing[, 3]) %>%
  pivot_longer(cols = c(`predicted (trailing + CV)`,
                        observed), names_to = 'Deaths') %>%
  ggplot(aes(time_value, value)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .4,
              fill = hue_pal()(2)[2]) +
  geom_line(aes(col = Deaths)) +
  geom_vline(xintercept = t0_date, lty = 2) +
  labs(title = "Quantile Regression: point predictions and intervals (time-series CV + trailing)",
       x = "", y = "Covid-19 deaths per 100k people") +
  theme(legend.position = "bottom", legend.title = element_blank())


AFTERNOON

LASSO

# AR-only forecaster (deaths regressed on its own lags) fitted with glmnet
# through the parsnip interface.
# NOTE(review): double(1) evaluates to 0, so this sets penalty = 0 and
# mixture = 0; in glmnet terms mixture = 0 is ridge, not lasso. For an
# actual lasso fit, mixture should be 1 with a positive penalty — confirm
# whether these are placeholders for later tuning.
ar_lasso <- arx_forecaster(epi_data = train %>% as_epi_df(), 
                           outcome = "deaths", 
                           predictors = "deaths", 
                           trainer = linear_reg(penalty = double(1), 
                                                mixture = double(1)) %>% 
                             set_engine("glmnet") %>%
                             #set_engine("spark") %>% 
                             translate(),
                           # lags 0, 7 and 14 days of deaths; 1-step-ahead
                           args_list = arx_args_list(lags = c(0L, 7L, 14L), 
                                                     ahead = 1L))

# Pull the fitted parsnip model object out of the epi_workflow
ar_lasso_fit <- extract_fit_parsnip(ar_lasso$epi_workflow)
#ar_lasso_fit